%% Setup Model Data Structures 
%{
This function takes data tables on students, options, applications and enrollment decisions and generates the main inputs to the model estimation. 

Key calculations 
-ExPost Feasible Choice Set: options that a student would have been assigned to had they put them on their list
-Waitlist Feasible Choice Set: applications that were waitlisted but where another lower-ranked application later matriculated
-Truthfully Ranked Apps: applications that were within a small bandwidth of being assigned, such that it is assumed that no strategic skipping of options
takes place between these applications. 

-Structures:
 students: student year level observations 
 
%}
function [Model_ChoiceData,options,apps,students,enrollment]=setup_data(Years,Options,Students,Apps,Enrollment)
% SETUP_DATA  Build the model-estimation inputs from the raw data tables.
%
% Inputs
%   Years      - vector of years matched against the `proceso` column of
%                every table; defaults to 2010:2012 when empty.
%   Options    - table of options (one row per option-year).
%   Students   - table of students (one row per student-year).
%   Apps       - table of applications.
%   Enrollment - table of enrollment decisions.
%
% Outputs
%   Model_ChoiceData - struct with student-level fields (scores, aid,
%                      application indices, feasibility matrices, ...) and
%                      option-level fields (ids, characteristics, type index).
%   options, apps, students, enrollment - the input tables restricted to
%                      Years, with the columns added below (program,
%                      platform, Apps, AppsIndex, ...). `students` has the
%                      rows flagged by dropApp removed.
%
% NOTE(review): the nargin guards below suggest Apps/Enrollment were meant to
% be optional, but `apps` and `enrollment` are used unconditionally further
% down, so in practice all five inputs must be supplied.

% Test nargin first: with no inputs, Years does not exist and isempty(Years)
% would raise before the nargin test is reached.
if nargin==0 || isempty(Years)
    Years=2010:2012;
end

%% Select subset of Years to work on
options=Options(ismember(Options.proceso,Years),:);
students=Students(ismember(Students.proceso,Years),:);
%clear Options Students
% Add choices 
if nargin>3
    
enrollment=Enrollment(ismember(Enrollment.proceso,Years),:);
apps=Apps(ismember(Apps.proceso,Years),:);
clear Apps Enrollment

% Fill missing enrollment codes from the cod2012 column.
missingCod=isnan(enrollment.cod);
enrollment.cod(missingCod)=enrollment.cod2012(missingCod);

end

nObs=size(apps,1);
nOptions=size(options,1);
nStudents=size(students,1);

fprintf('                                            \n')
fprintf('Number of Higher Education Options : %10.0f \n',nOptions)
fprintf('Number of Students                 : %10.0f \n',nStudents)
fprintf('Number of Applications             : %10.0f \n',nObs)

%% Create unique program identifier
options.program=options.FL_UNISEDE;
% Drop applications whose code is not among the selected options, then attach
% the program id of the matching option (first match on cod).
[~,rcod]=ismember(apps.cod,options.cod);
apps = apps(rcod~=0,:);
[~,rcod]=ismember(apps.cod,options.cod);
apps.program=options.program(rcod);

% Same filtering and program lookup for enrollment records.
[~,rcod]=ismember(enrollment.cod,options.cod);
enrollment = enrollment(rcod ~= 0,:);

[~,rcod]=ismember(enrollment.cod,options.cod);
enrollment.program=options.program(rcod);
clear rcod

%% Organize Student Level Data

% Preallocate space for student-year level data (up to 10 ranked applications)
studentsApps=NaN(nStudents,10); % in order 
studentsAppsIndex=NaN(nStudents,10);
studentsTruthFullRank=zeros(nStudents,10); % indicator of which apps are within a window considered "truthful" 
studentsAssigned=zeros(nStudents,10);
studentsExPostWaitlistFeasible=zeros(nStudents,10);
studentsAppAdmitCutoffs=NaN(nStudents,10);
studentsAppWaitCutoffs=NaN(nStudents,10);

studentsPunt=NaN(nStudents,10);
studentsIndexIN=NaN(nStudents,1);


% We want to get the index in "options" for each application:
% We want to indicate index which applications are within "truthful bounds" for estimation with apps. 
% We want to indicate index which applications are "Waitlist Feasible" and "Assigned" option for estimation with enrollment

truthWindow=5000; % window of truthful apps. 
maxdropWait=5000;
% Only linear index i (i.e. the first column) is ever written below.
noapp=zeros(nStudents,length(Years));

tic
parfor i=1:nStudents
    r=find(students.proceso(i)==apps.proceso & students.rut(i)==apps.rut);
    if isempty(r)
        noapp(i)=1;
    else
        % Columns: 1 pref, 2 proceso, 3 cod, 4 sit, 5 lugar, 6 punt,
        %          7 pred_index, 8 punt_lastAdmit, 9 punt_last_waitAdmit, 10 program
        application_i=[apps.pref(r) apps.proceso(r) apps.cod(r) apps.sit(r) apps.lugar(r) apps.punt(r) apps.pred_index(r) apps.punt_lastAdmit(r) apps.punt_last_waitAdmit(r) apps.program(r) ];
        application_i=sortrows(application_i,1,'ascend'); % order by preference rank
        applength=size(application_i,1);
        % Pad each row out to 10 slots with NaN.
        temp_Punt = [application_i(:,7)' NaN(1,10-applength)];
        temp_Apps = [application_i(:,3)' NaN(1,10-applength)];
        temp_AppsAdmitCutoffs = [application_i(:,8)' NaN(1,10-applength)];
        temp_AppsWaitCutoffs = [application_i(:,9)' NaN(1,10-applength)];        
        studentsPunt(i,:) = temp_Punt;
        studentsApps(i,:) = temp_Apps;
        studentsAppAdmitCutoffs(i,:) = temp_AppsAdmitCutoffs;
        studentsAppWaitCutoffs(i,:) = temp_AppsWaitCutoffs;        
        
        safetyschool=0;nadmits=0; 
        % The loop range is kept as the constant 1:10 (with a skip) because j
        % indexes sliced parfor outputs.
        for j=1:10
            if j>applength
               continue 
            end
            % Row of this application in options (same year and code). Take the
            % first match so a duplicated option row cannot break the scalar
            % assignments below.
            index_ij=find(application_i(j,2)==options.proceso & application_i(j,3)==options.cod,1);
            % Use punt when available, otherwise fall back to pred_index.
            score=application_i(j,7);
            if ~isnan(application_i(j,6))
                score=application_i(j,6);
            end
           
            
            if ~isempty(index_ij)
                % Store application index
                studentsAppsIndex(i,j)=index_ij;
                
                % Truthful rank is apps that were "close" to actual cutoff and feasible. Includes waitlist(25), admit(24), above(26)
                studentsTruthFullRank(i,j)=(abs(score-options.punt_lastAdmit(index_ij))<=truthWindow & options.index_cut(index_ij)<=score & options.psu_cut(index_ij)<=score & safetyschool==0);
                
                if application_i(j,4)==24 && studentsTruthFullRank(i,j)==0 
                % option is assigned, then it is truthfully ranked
                % safety school if admitted but not even close   
                   safetyschool=1;
                   studentsTruthFullRank(i,j)=true;   
                end
                
                % Waitlist Call feasible are options where someone lower ranked was later enrolled suggesting it was feasible to get off the waitlist.
                studentsExPostWaitlistFeasible(i,j)=(application_i(j,4)==25 & score>=application_i(j,9) & (abs(application_i(j,8)-score)<=maxdropWait));
                
                % Record only the first admit (sit==24); studentsAssigned is
                % preallocated to zero so other entries need no write.
                if nadmits==0 && application_i(j,4)==24
                    studentsIndexIN(i)=index_ij;
                    studentsAssigned(i,j)=1;
                    nadmits=1; % do not enter this condition anymore, only first admit
                end
                
            end
        end
    end
end
toc

students.Apps=studentsApps;
students.AppsIndex=studentsAppsIndex;
students.Punt=studentsPunt;
students.AppsAssigned=studentsAssigned;
students.AppsTruthFullRank=studentsTruthFullRank; % indicator of which apps are within a window considered "truthful" 
students.AppsExPostWaitlistFeasibles=studentsExPostWaitlistFeasible;
students.AppsAdmitCutoffs=studentsAppAdmitCutoffs;
students.AppWaitCutoffs=studentsAppWaitCutoffs;
students.NoApp=noapp;
students.IN=studentsIndexIN;


% Left-pack the option indices of the truthfully ranked applications.
ChoiceIndexRank=NaN(size(students.AppsIndex));
for i=1:size(students.Apps,1)
truthfullApps=students.AppsIndex(i,:).*students.AppsTruthFullRank(i,:);

truthfullApps(isnan(truthfullApps) | truthfullApps==0)=[];
ChoiceIndexRank(i,1:length(truthfullApps))=truthfullApps;
end
students.ChoiceIndexRank=ChoiceIndexRank;


% Look up, for each student-year, the options row of the enrollment record.
tic
indexStudentEnrollment=NaN(length(students.rut),1);
codStudentEnrollment=NaN(length(students.rut),1);
parfor ii=1:length(students.rut)
    
    % First enrollment record for this student-year, if any.
    r=find(students.proceso(ii)==enrollment.proceso & students.rut(ii)==enrollment.rut,1);
    
    if ~isempty(r)
        % First options row with the same code and year.
        k=find(enrollment.cod(r)==options.cod & enrollment.proceso(r)==options.proceso,1);
        
        if ~isempty(k)
        indexStudentEnrollment(ii)=k;
        codStudentEnrollment(ii)=options.cod(k);
        end
    end
end
toc

students.indexEnrollment=indexStudentEnrollment;
students.codEnrollment=codStudentEnrollment;

    

%% Get Ex Post Feasible Set of Options

% NOTE(review): this writes to `last_admit`, which is not read anywhere else in
% this function; the NaN tests on punt_lastAdmit below suggest punt_lastAdmit
% may have been the intended target. Left unchanged - confirm against the data.
options.last_admit(options.punt_lastAdmit==0)=NaN;

tic
%---------Predict Index with rules and score-------------------------------
% nStudents-by-nOptions matrix: each student's weighted score for each option.
Index=students.nem*options.x_b_nem'+students.leng*options.x_b_psu_leng'+students.mate*options.x_b_psu_mate'...
    +students.hria*options.x_b_psu_hria'+students.cien*options.x_b_psu_cien'...
    -min(students.hria,students.cien)*(options.max_hria_cien.*options.x_b_psu_hria)';

AVE=students.leng*0.5+students.mate*0.5;
%--------------------------------------------------------------------------
toc

% Get Pseudo Cutoffs for Offplatform options 
punt_min=NaN(length(options.cod),1);punt_p5=NaN(length(options.cod),1);punt_p50=NaN(length(options.cod),1);
for j=1:length(options.cod)
   
    optYear=options.proceso(j);
    
    % Students of that year whose enrollment code equals this option's code.
    r=find(students.proceso==optYear & students.codEnrollment==options.cod(j));
    
    if ~isempty(r)
    punt_ij=Index(r,j);
    punt_min(j)=min(punt_ij);
    punt_p5(j)=prctile(punt_ij,5);
    punt_p50(j)=prctile(punt_ij,50);
    
    end
end

% Cutoff used for off-platform feasibility: observed cutoff, else the 5th
% percentile of enrolled students' index, floored at 45000.
Cut=options.punt_lastAdmit';
Cut(isnan(Cut)==1)=punt_p5(isnan(Cut)==1);
Cut(isnan(Cut)==1 | Cut<45000)=45000;

options.platform=zeros(size(options.cod,1),1);
options.platform(options.cod<38000)=1;
options.platform(options.proceso==2012)=1;

tic
tempFeasibleAny=(students.proceso(:,1)==options.proceso' & Index>options.index_cut'*100 & AVE>options.psu_cut'); % same year, clears minimum score and minimum index
tempFeasibleOff=(repmat(options.platform'==0,size(Index,1),1)==1 & Index>=Cut & students.proceso(:,1)==options.proceso' & Index>options.index_cut'*100 & AVE>options.psu_cut' & options.proceso'<2012);
tempExpostFeasible=(Index>=options.punt_lastAdmit' & students.proceso(:,1)==options.proceso' & Index>options.index_cut'*100 & AVE>options.psu_cut' & options.platform'==1);
tempExpostWaitlistFeasible=(((Index+truthWindow>=options.punt_lastAdmit')) & students.proceso(:,1)==options.proceso' & Index>options.index_cut'*100 & AVE>options.psu_cut' & options.platform'==1);


%  Make sure assigned are simulated feasible 
% Diagnostic counters of how many observed outcomes had to be forced feasible
% (currently not reported).
countWrong=0;countWrongWait=0;countWrongOff=0;
for i=1:size(tempExpostFeasible,1)
    
    % The assigned option is always ex-post feasible.
    if students.NoApp(i)==0 && isnan(students.IN(i))==0     
    countWrong=countWrong+(tempExpostFeasible(i,students.IN(i))==0);    
    tempExpostFeasible(i,students.IN(i))=1;
    end
    
    % Every truthfully ranked application is waitlist feasible.
    if students.NoApp(i)==0
    wlist=students.AppsIndex(i,students.AppsTruthFullRank(i,:)==1);
    if isempty(wlist)==0
        for j=1:length(wlist)
        countWrongWait=countWrongWait+sum(tempExpostWaitlistFeasible(i,wlist(j))==0);    
        tempExpostWaitlistFeasible(i,wlist(j))=1;
        end
    end
    end
    
    % An enrolled option without an observed cutoff is off-platform feasible.
    if isnan(students.indexEnrollment(i))==0
        if isnan(options.punt_lastAdmit(students.indexEnrollment(i)))==1
        countWrongOff=countWrongOff+sum(tempFeasibleOff(i,students.indexEnrollment(i))==0);    
        tempFeasibleOff(i,students.indexEnrollment(i))=1;
        end
    end
    
end

FeasibleAny=sparse(tempFeasibleAny);
FeasibleOff=sparse(tempFeasibleOff);
ExPostFeasible=sparse(tempExpostFeasible);
ExPostWaitlistFeasible=sparse(tempExpostWaitlistFeasible);

%fprintf('Feasible placements given rules and score data %10.5f \n',[counterRight/(counterRight+counterWrong)]);
clear tempFeasibleAny tempFeasibleOff tempExpostFeasible tempExpostWaitlistFeasible 


% tag everyone with zero feasible options or without mandatory tests or GPA
dropApp=(sum(ExPostFeasible,2)==0 | (students.leng==0 | students.mate==0 | students.nem==0));
% nnz works for sparse and full input and returns a full scalar for fprintf.
fprintf('Number of Students dropped         : %10.0f \n',nnz(dropApp))
% drop whatever we want to drop right away (not line by line)
students(dropApp,:)=[];

FeasibleAny(dropApp,:)=[];
ExPostFeasible(dropApp,:)=[];
ExPostWaitlistFeasible(dropApp,:)=[];
FeasibleOff(dropApp,:)=[];
Index(dropApp,:)=[];


%% Student-level model inputs
scholarships=students{:,{'s_tp','bea','jgm','nacional','nuevomilenio','hijoprofe','bicentenario','vocacionprofe','fondosolidario','edutecnica'}};
Model_ChoiceData.Scholarships=scholarships;        
scores=students{:,{'mate', 'leng'}};
scores(scores<300 & scores>0)=300; % floor positive scores at 300
scores=(scores-300)/100;
Model_ChoiceData.admitIN=students.IN;
Model_ChoiceData.Scores=scores;
Model_ChoiceData.Sinorm=(scores-mean(scores))./std(scores);
Model_ChoiceData.StudentXsName={'male', 'private_hs'};
Model_ChoiceData.StudentXs=students{:,Model_ChoiceData.StudentXsName};
Model_ChoiceData.LocationXi=students.region;
Model_ChoiceData.AidAmount=students.aid_amount/1e+06;
Model_ChoiceData.XiReferenceTuition=students.reference_tuition;
Model_ChoiceData.appYear=students.proceso;
Model_ChoiceData.AppsTruthFullRank=students.AppsTruthFullRank; % indicator of which apps are within a window considered "truthful" 
Model_ChoiceData.AppsExPostWaitlistFeasibles=students.AppsExPostWaitlistFeasibles;

Model_ChoiceData.Index = Index;

Model_ChoiceData.NoApp=students.NoApp;
Model_ChoiceData.current_cohort=students.current_cohort;
Model_ChoiceData.old_cohort1=students.old_cohort1;
Model_ChoiceData.old_cohort2=students.old_cohort2;
Model_ChoiceData.old_cohort3=students.old_cohort3;
Model_ChoiceData.old_cohort4=students.old_cohort4;
Model_ChoiceData.old_cohort5=students.old_cohort5;
Model_ChoiceData.past_enrollment1=students.past_enrollment1;
Model_ChoiceData.past_enrollment2=students.past_enrollment2;
Model_ChoiceData.past_enrollment3=students.past_enrollment3;

% ApplicationsIndex == index in option matrix of applications 
Model_ChoiceData.ApplicationsIndex=students.AppsIndex;
% ApplicationsID == demrecode of applications
Model_ChoiceData.ApplicationsID=students.Apps;

Model_ChoiceData.AppsTruthFullRank=students.AppsTruthFullRank;
Model_ChoiceData.AppsCalledWaitlist=students.AppsExPostWaitlistFeasibles;
Model_ChoiceData.AssignedApp=students.AppsAssigned;

Model_ChoiceData.FeasibleAny=FeasibleAny;
Model_ChoiceData.ExPostFeasible=ExPostFeasible;
Model_ChoiceData.ExPostWaitlistFeasible=ExPostWaitlistFeasible;
Model_ChoiceData.FeasibleOff=FeasibleOff;



Model_ChoiceData.ChoiceIndexRank=students.ChoiceIndexRank;

Model_ChoiceData.enrollment=students.indexEnrollment;

Model_ChoiceData.grad6=students.grad6;


%% Adjust options available (Area,Major,UniqueMajor,Inst)
Model_ChoiceData.OptionID=options.cod;
Model_ChoiceData.optionYear=options.proceso;


%Adjustments to indexes
Model_ChoiceData.InstID=unique(options.Univ); %should delete options that aren't in the given years
tempInst=table2cell(unique(options(:,{'Univ','sigla_universidad'}))); %should delete options that aren't in the given years
listInst=cell2mat(tempInst(:,1));
InstName={};
for i=1:length(Model_ChoiceData.InstID)
    loc=find(Model_ChoiceData.InstID(i)==listInst);
    InstName{i,1}=tempInst{loc,2};
end
Model_ChoiceData.InstName=InstName;
Model_ChoiceData.InstCode=options.Univ;

Model_ChoiceData.MajorID=unique(options.FLcode_app); %should delete majors that aren't in the given years
tempMajorName=table2cell(unique(options(:,{'FLcode_app','MajorName'}))); %should delete majors that aren't in the given years

listMajor=cell2mat(tempMajorName(:,1));
MajorName={};
for i=1:length(Model_ChoiceData.MajorID)
    loc=find(Model_ChoiceData.MajorID(i)==listMajor);
    MajorName{i,1}=tempMajorName{loc,2};
end
Model_ChoiceData.MajorName=MajorName;
Model_ChoiceData.MajorCode=options.FLcode_app;


% Setup Option characteristics 
Model_ChoiceData.OptionSelectivity=options.Selectivity;
% Missing selectivity is replaced by the 25th percentile of the column.
Model_ChoiceData.OptionSelectivity(isnan(Model_ChoiceData.OptionSelectivity))=prctile(Model_ChoiceData.OptionSelectivity,25);
Model_ChoiceData.OptionXj=[options{:,{'Cquant','Cqual'}}./10 (options{:,{'Cquant','Cqual'}}./10).^2 Model_ChoiceData.OptionSelectivity Model_ChoiceData.OptionSelectivity.^2] ;
Model_ChoiceData.XjName={'STEM','HUMA','STEM^2','HUMA^2','Selectivity','Selectivity^2'};
Model_ChoiceData.OptionXjnorm=(Model_ChoiceData.OptionXj-mean(Model_ChoiceData.OptionXj))./std(Model_ChoiceData.OptionXj);
Model_ChoiceData.LocationXj=options.Geo_RegionCode;
Model_ChoiceData.ArancelXj=(options.valor_arancel+options.valor_matricula)/1e+06;
Model_ChoiceData.ArancelReferencialXj=options.arancel_referencial_proxy/1e+06;

Model_ChoiceData.BVPj=(options.bvp==1 & options.Area==7 & options.proceso>=2011);
Model_ChoiceData.slots = options.slots;

% Position of each option's area / major / institution / program in the
% corresponding sorted list of unique values. The unique lists are
% loop-invariant, so compute them once.
uArea=unique(options.Area);
uMajor=unique(options.FLcode_app);
uInst=unique(options.Univ);
uProgram=unique(options.program);
typeIndex=NaN(length(options.cod),4);
for j=1:length(options.cod)
    area=find(options.Area(j)==uArea);
    major=find(options.FLcode_app(j)==uMajor);
    inst=find(options.Univ(j)==uInst);
    program=find(options.program(j)==uProgram);
    % A NaN value matches nothing; such options keep a NaN row and are dropped.
    if ~(isempty(area) || isempty(major) || isempty(inst) || isempty(program))
        typeIndex(j,:)=[area(1) major(1) inst(1) program(1)];
    end
end
dropOption=(sum(isnan(typeIndex),2)>0);
listDrop=find(dropOption==1);
typeIndex(listDrop,:)=[];

Model_ChoiceData.TypeName={'Area','Major','Institution','Program'}; 
Model_ChoiceData.TypeIndex=typeIndex; 

% add off platform flag for options of applicable
if nargin<5
   Model_ChoiceData.offPlatform = zeros(nOptions,1);
else
   Model_ChoiceData.offPlatform = options.platform==0;
end

% %Cleaning up
% Remove dropped options from every option-indexed field so they all share
% the same option dimension as TypeIndex.
% NOTE(review): the returned `options` table is not trimmed, and the
% student-level index fields (ApplicationsIndex, admitIN, enrollment,
% ChoiceIndexRank) hold row positions in the untrimmed options table; they are
% not remapped here.
Model_ChoiceData.InstCode(dropOption==1,:)=[];
Model_ChoiceData.MajorCode(dropOption==1,:)=[];

Model_ChoiceData.OptionID(dropOption==1,:)=[];
Model_ChoiceData.optionYear(dropOption==1,:)=[];
Model_ChoiceData.LocationXj(dropOption==1,:)=[];

Model_ChoiceData.OptionXj(dropOption==1,:)=[];
Model_ChoiceData.ArancelXj(dropOption==1,:)=[];
Model_ChoiceData.ArancelReferencialXj(dropOption==1,:)=[];
Model_ChoiceData.BVPj(dropOption==1,:)=[];
Model_ChoiceData.offPlatform(dropOption==1,:)=[];
Model_ChoiceData.slots(dropOption==1,:)=[];
Model_ChoiceData.OptionSelectivity(dropOption==1,:)=[];
Model_ChoiceData.OptionXjnorm(dropOption==1,:)=[];
Model_ChoiceData.Index(:,dropOption==1)=[];
Model_ChoiceData.ExPostFeasible(:,dropOption==1)=[];
Model_ChoiceData.ExPostWaitlistFeasible(:,dropOption==1)=[];
Model_ChoiceData.FeasibleAny(:,dropOption==1)=[];
Model_ChoiceData.FeasibleOff(:,dropOption==1)=[];
end
